caret Package

The caret package has functions to simplify just about everything in data science / machine learning.
Install with:
# Install caret (plus suggested dependencies) only when it is missing.
# rownames(installed.packages()) is the vector of installed package
# names; matching against the whole matrix returned by
# installed.packages() could also hit a package name that merely
# appears inside another package's Depends/Suggests field.
if (!("caret" %in% rownames(installed.packages()))) {
  install.packages("caret",
                   dependencies = c("Depends", "Suggests"),
                   repos = "http://cran.us.r-project.org")
}
library("caret");

Preprocessing
preProcess(x, method = c("center", "scale"), thresh = 0.95,pcaComp = NULL,na.remove = TRUE,k = 5,knnSummary = mean,outcome = NULL,fudge = .2,numUnique = 3,verbose = FALSE,...)Segmenting Data
createDataPartition(y, times = 1, p = 0.5,list = TRUE, groups = min(5, length(y)))createResample(y, times = 10, list = TRUE)createFolds(y, k = 10, list = TRUE, returnTrain = FALSE)createMultiFolds(y, k = 10, times = 5)createTimeSlices(y, initialWindow, horizon = 1, fixedWindow = TRUE, skip = 0)Training and Testing
train(x, y, method = "rf", preProcess = NULL, ..., weights = NULL, metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric %in% c("RMSE", "logLoss"), FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL, tuneLength = 3)predict (object, ...)Comparing Models
confusionMatrix(data, reference, positive = NULL, dnn = c("Prediction", "Reference"), prevalence = NULL, mode = "sens_spec", ...)All possible models are trained the same way – by passing a string to caret::train(..., method='string', ...). The full list of method strings is below. For details see http://topepo.github.io/caret/using-your-own-model-in-train.html
names(getModelInfo())## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "bdk"
## [19] "binda" "blackboost" "blasso"
## [22] "blassoAveraged" "Boruta" "bridge"
## [25] "brnn" "BstLm" "bstSm"
## [28] "bstTree" "C5.0" "C5.0Cost"
## [31] "C5.0Rules" "C5.0Tree" "cforest"
## [34] "chaid" "CSimca" "ctree"
## [37] "ctree2" "cubist" "dda"
## [40] "deepboost" "DENFIS" "dnn"
## [43] "dwdLinear" "dwdPoly" "dwdRadial"
## [46] "earth" "elm" "enet"
## [49] "enpls.fs" "enpls" "evtree"
## [52] "extraTrees" "fda" "FH.GBML"
## [55] "FIR.DM" "foba" "FRBCS.CHI"
## [58] "FRBCS.W" "FS.HGD" "gam"
## [61] "gamboost" "gamLoess" "gamSpline"
## [64] "gaussprLinear" "gaussprPoly" "gaussprRadial"
## [67] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [70] "GFS.GCCL" "GFS.LT.RS" "GFS.THRIFT"
## [73] "glm" "glmboost" "glmnet"
## [76] "glmStepAIC" "gpls" "hda"
## [79] "hdda" "hdrda" "HYFIS"
## [82] "icr" "J48" "JRip"
## [85] "kernelpls" "kknn" "knn"
## [88] "krlsPoly" "krlsRadial" "lars"
## [91] "lars2" "lasso" "lda"
## [94] "lda2" "leapBackward" "leapForward"
## [97] "leapSeq" "Linda" "lm"
## [100] "lmStepAIC" "LMT" "loclda"
## [103] "logicBag" "LogitBoost" "logreg"
## [106] "lssvmLinear" "lssvmPoly" "lssvmRadial"
## [109] "lvq" "M5" "M5Rules"
## [112] "manb" "mda" "Mlda"
## [115] "mlp" "mlpML" "mlpSGD"
## [118] "mlpWeightDecay" "mlpWeightDecayML" "multinom"
## [121] "nb" "nbDiscrete" "nbSearch"
## [124] "neuralnet" "nnet" "nnls"
## [127] "nodeHarvest" "oblique.tree" "OneR"
## [130] "ordinalNet" "ORFlog" "ORFpls"
## [133] "ORFridge" "ORFsvm" "ownn"
## [136] "pam" "parRF" "PART"
## [139] "partDSA" "pcaNNet" "pcr"
## [142] "pda" "pda2" "penalized"
## [145] "PenalizedLDA" "plr" "pls"
## [148] "plsRglm" "polr" "ppr"
## [151] "protoclass" "pythonKnnReg" "qda"
## [154] "QdaCov" "qrf" "qrnn"
## [157] "randomGLM" "ranger" "rbf"
## [160] "rbfDDA" "Rborist" "rda"
## [163] "relaxo" "rf" "rFerns"
## [166] "RFlda" "rfRules" "ridge"
## [169] "rlda" "rlm" "rmda"
## [172] "rocc" "rotationForest" "rotationForestCp"
## [175] "rpart" "rpart1SE" "rpart2"
## [178] "rpartCost" "rpartScore" "rqlasso"
## [181] "rqnc" "RRF" "RRFglobal"
## [184] "rrlda" "RSimca" "rvmLinear"
## [187] "rvmPoly" "rvmRadial" "SBC"
## [190] "sda" "sddaLDA" "sddaQDA"
## [193] "sdwd" "simpls" "SLAVE"
## [196] "slda" "smda" "snn"
## [199] "sparseLDA" "spikeslab" "spls"
## [202] "stepLDA" "stepQDA" "superpc"
## [205] "svmBoundrangeString" "svmExpoString" "svmLinear"
## [208] "svmLinear2" "svmLinear3" "svmLinearWeights"
## [211] "svmLinearWeights2" "svmPoly" "svmRadial"
## [214] "svmRadialCost" "svmRadialSigma" "svmRadialWeights"
## [217] "svmSpectrumString" "tan" "tanSearch"
## [220] "treebag" "vbmpRadial" "vglmAdjCat"
## [223] "vglmContRatio" "vglmCumulative" "widekernelpls"
## [226] "WM" "wsrf" "xgbLinear"
## [229] "xgbTree" "xyf"
Use caret::createDataPartition( y=srcData, times=1, p=0.5, list=TRUE, groups=min(5, length(y)) ):
# Split kernlab's spam data set into a 75% training / 25% test partition.
library(caret); library(kernlab); data(spam);
# creates a numeric vector that can subset from spam
# (list=FALSE returns a matrix of row indices instead of a list)
inTrain<-createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain, ]
# subsets rows NOT indexed by inTrain
testing <- spam[-inTrain, ]
dim(inTrain)## [1] 3451 1
# The training set
head( spam[inTrain, 1:8 ]); ## make address all num3d our over remove internet
## 1 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00
## 2 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07
## 4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63
## 6 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85
## 7 0.00 0.00 0.00 0 1.92 0.00 0.00 0.00
## 9 0.15 0.00 0.46 0 0.61 0.00 0.30 0.00
# The test set
head(spam[-inTrain, 1:8]);## make address all num3d our over remove internet
## 3 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12
## 5 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63
## 8 0.00 0.00 0.00 0 1.88 0.00 0.00 1.88
## 15 0.00 0.00 1.42 0 0.71 0.35 0.00 0.35
## 16 0.00 0.42 0.42 0 1.27 0.00 0.42 0.00
## 21 0.00 0.00 0.00 0 0.00 0.00 0.00 0.00
Note the row numbers
set.seed(32343)
# trains a generalized linear model
modelFit <- train(type ~., data = training, method = 'glm')
# view summary of model
modelFit## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9198818 0.8317119
##
##
# view parameters (weights/theta) of final model
modelFit$finalModel##
## Call: NULL
##
## Coefficients:
## (Intercept) make address
## -1.617e+00 -5.494e-01 -1.528e-01
## all num3d our
## 1.429e-01 3.733e+00 5.066e-01
## over remove internet
## 9.540e-01 1.883e+00 1.234e+00
## order mail receive
## 1.109e+00 1.423e-01 2.384e-01
## will people report
## -1.205e-01 -2.180e-01 7.444e-02
## addresses free business
## 9.789e-01 9.337e-01 1.328e+00
## email you credit
## 9.268e-02 1.011e-01 1.426e+00
## your font num000
## 1.703e-01 1.655e-01 1.756e+00
## money hp hpl
## 3.088e-01 -2.713e+00 -6.414e-01
## george num650 lab
## -2.036e+01 5.900e-01 -2.029e+00
## labs telnet num857
## -3.933e-01 -9.324e-02 1.108e+00
## data num415 num85
## -1.068e+00 1.283e+00 -1.629e+00
## technology num1999 parts
## 1.210e+00 8.426e-02 1.833e+00
## pm direct cs
## -7.681e-01 -3.524e-01 -5.684e+02
## meeting original project
## -3.583e+00 -1.929e+00 -1.578e+00
## re edu table
## -9.668e-01 -1.758e+00 -2.806e+00
## conference charSemicolon charRoundbracket
## -3.374e+00 -1.325e+00 -7.183e-01
## charSquarebracket charExclamation charDollar
## -5.464e-01 3.031e-01 5.312e+00
## charHash capitalAve capitalLong
## 2.525e+00 7.188e-02 7.347e-03
## capitalTotal
## 1.131e-03
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 1288 AIC: 1404
predictions <- predict(object = modelFit, newdata = testing)
# produces a factor vector of class labels, 'spam' or 'nonspam'
predictions[1:16]## [1] spam spam spam spam spam nonspam spam nonspam
## [9] spam nonspam nonspam nonspam spam spam spam spam
## Levels: nonspam spam
confusionMatrix(data = predictions, reference = testing$type)## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 662 56
## spam 35 397
##
## Accuracy : 0.9209
## 95% CI : (0.9037, 0.9358)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8329
## Mcnemar's Test P-Value : 0.03603
##
## Sensitivity : 0.9498
## Specificity : 0.8764
## Pos Pred Value : 0.9220
## Neg Pred Value : 0.9190
## Prevalence : 0.6061
## Detection Rate : 0.5757
## Detection Prevalence : 0.6243
## Balanced Accuracy : 0.9131
##
## 'Positive' Class : nonspam
##
library(caret); library(kernlab); data(spam);
# creates a numeric vector that can subset from spam
inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]dim(training); dim(testing)## [1] 3451 58
## [1] 1150 58
k = the number of folds
returnTrain = specifies whether the folds are training or test folds {TRUE, FALSE}
list = determines the return structure data type {list, vector, matrix}
set.seed(32323)
# split data into 10 groups with random dropout in each
# returns a list of folds
# each fold is a numeric vector of row nums from spam$type
folds <- createFolds(y=spam$type, k=10, list=TRUE, returnTrain=TRUE) # vs returnTrain=FALSE for test set
# display num of examples in each fold, source had 4601 rows
sapply(folds, length)## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 4141 4140 4141 4142 4140 4142 4141 4141 4140 4141
# view row dropout in various folds
folds[[1]][1:20]; folds[[2]][1:20]## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## [1] 1 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 22
# use folds to index data sets via
trn2 <- spam$type[folds[[2]]]
val2 <- spam$type[-folds[[2]]]
or
tst <- createFolds(y=spam$type, k=10, list=TRUE, returnTrain=FALSE) # vs returnTrain=TRUE for training set
Return the training set via something like df[-tst, ]
set.seed(32323)
# fill each fold with random sample (with replacement) of rows
folds <- createResample(y = spam$type, times = 10, list = TRUE)
# view sizes of resampled folds, note the random collection of rows
sapply(folds, length)## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06
## 4601 4601 4601 4601 4601 4601
## Resample07 Resample08 Resample09 Resample10
## 4601 4601 4601 4601
# note the repeated rows
folds[[1]][1:20]## [1] 1 2 3 3 3 5 5 7 8 12 12 12 13 13 13 14 14 15 16 16
initialWindow= rows in the training sethorizon= the next \(n\) continuous rows (test set)set.seed(32323)
# Time-series cross validation: build sliding-window slices over a
# 1000-point pseudo time series.
tme <- 1:1000
# each training window holds 20 consecutive points; each test window
# holds the next 10 consecutive points
folds <- createTimeSlices(y = tme, initialWindow = 20, horizon = 10)
# what are the components in each of our time slices? (train / test)
names(folds)## [1] "train" "test"
Recall the generalized linear model trained on the spam dataset:
library(caret); library(kernlab); data(spam);
inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
modelFit <- train(type ~., data = training, method = 'glm')The caret::train() has a number of options to control how training is done. (View any default method parameters with base::args(functionName.default))
library(caret)## Loading required package: lattice
## Loading required package: ggplot2
args(train.default)## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
## metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric %in%
## c("RMSE", "logLoss"), FALSE, TRUE), trControl = trainControl(),
## tuneGrid = NULL, tuneLength = 3)
## NULL
preProcess = NULL, ... takes a string vector of preprocessing methods
weights = NULL takes a numeric vector of case weights, to manually up or down-weight observationsmetric = ... a string that specifies the cost function to minimize; by default is RMSE for numerics and fractional accuracy for factors
trControl = trainControl(...) controls additional options
?trainControlargs(trainControl)## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
## p = 0.75, search = "grid", initialWindow = NULL, horizon = 1,
## fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE,
## returnResamp = "final", savePredictions = FALSE, classProbs = FALSE,
## summaryFunction = defaultSummary, selectionFunction = "best",
## preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5),
## sampling = NULL, index = NULL, indexOut = NULL, indexFinal = NULL,
## timingSamps = 0, predictionBounds = rep(FALSE, 2), seeds = NA,
## adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
## trim = FALSE, allowParallel = TRUE)
## NULL
method = boot the resampling method
number = the number of folds or resampling iterations
repeats = the number of repetitions for k-fold cross validation
initialWindow = size of the time series training set
horizon = size of the time series test set
savePredictions = whether to return predictions from each resampling iteration {"all", "none", "final"} (or TRUE/FALSE)
allowParallel = flag to enable parallel processing
seeds = initializes the random number generator to a determined state for reproducible resampling
Plots are your friends. Git gud at them. For this section, we use example data from ISLR Package, the companion to the book Introduction to Statistical Learning with R.
Install with:
# Install ISLR only when it is missing. rownames(installed.packages())
# restricts the membership test to actual installed package names,
# rather than every cell of the matrix installed.packages() returns.
if (!("ISLR" %in% rownames(installed.packages()))) {
  install.packages("ISLR", repos = "http://cran.us.r-project.org")
}
library("ISLR"); library(ggplot2);
data(Wage)
summary(Wage)## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
Set aside a test set (and validation set) before beginning exploratory analysis.
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training); dim(testing)## [1] 2102 12
## [1] 898 12
Using caret::featurePlot()
# choose 3 features
mySet <- training[, c('age', 'education', 'jobclass')]
# plot wage as a function of features (4x4 = 16)
featurePlot(x=mySet, y=training$wage, plot='pairs')ggplot(data=training) + aes(x=age, y=wage) + geom_point() Note the natural vertical segregation. How to investigate this further?
Answer: color the scatter by an additional feature:
ggplot(data=training) + aes(x=age, y=wage, color=jobclass) + geom_point() Very few industrial jobs are in the upper group.
It is easy to do simple regressions inside a plot, for easy viewing. Oddly, it’s actually very difficult to do your own regression, then add that to a plot. Proceed accordingly!
g <- ggplot(data=training) + aes(x=age, y=wage, color=education) + geom_point()
# add linear regression: wage = f(age)
g <- g + geom_smooth(method = "lm", formula = y~x)
print(g) Regression lines have been added to plot
The method here is to pseudo-manually cut the data with Hmisc::cut2(), then plot the resulting quantiles.
# Install Hmisc only when it is missing. rownames(installed.packages())
# restricts the membership test to actual installed package names,
# rather than every cell of the matrix installed.packages() returns.
if (!("Hmisc" %in% rownames(installed.packages()))) {
  install.packages("Hmisc", repos = "http://cran.us.r-project.org")
}
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
cutWage <- cut2(training$wage, g=3)
table(cutWage)## cutWage
## [ 20.1, 93.1) [ 93.1,118.9) [118.9,314.3]
## 713 708 681
t1 <- table(cutWage, training$jobclass)
t1##
## cutWage 1. Industrial 2. Information
## [ 20.1, 93.1) 449 264
## [ 93.1,118.9) 361 347
## [118.9,314.3] 275 406
# a proportional table
prop.table(x = t1, margin = 1) # margin=dimension##
## cutWage 1. Industrial 2. Information
## [ 20.1, 93.1) 0.6297335 0.3702665
## [ 93.1,118.9) 0.5098870 0.4901130
## [118.9,314.3] 0.4038179 0.5961821
Plot this table to look for trends
library(gridExtra)##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Hmisc':
##
## combine
g1 <- ggplot(data=training) + aes(x=cutWage, y=age, fill=cutWage) + geom_boxplot()
# view points on top of boxplot
g2 <- ggplot(data=training) + aes(x=cutWage, y=age, fill=cutWage) + geom_boxplot() + geom_jitter()
grid.arrange(g1,g2, ncol=2)For continuous predictors
ggplot(data=training) + aes(x=cutWage, color=education) + geom_density()All of the above are ways to examine properties of your data set, but don’t use your test set for exploration.
You’re always looking for
After plotting the variables up front, it’s time to prep the data set for training.
Consider the kernlab::spam data set one more time:
library(caret); library(kernlab); data(spam);
# creates a numeric vector that can subset from spam
inTrain<-createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
hist(training$capitalAve, main="", xlab="ave. capital run length") This is the distribution of capital letter sequence length between spam/nonspam. This variable is highly skewed, and will bone most learning algorithms. We can redress this by shifting/stretching/squashing the input data and writing the result to an R::caret Preprocess Object. That object becomes the training inputs to a hypothesis function, and we pass it to the
caret::train(type= ,data=preObj, method="string") and caret::predict(object=preObj, newdata=groundTruth) functions.
preObj <- preProcess(object = inputSet , method = "" )predict(object = preObj, newdata = groundTruth)One way to fix model-boning is by doing a statistical z-transform on the data:
# Manual standardization of a test-set column. The mean and sd come
# from the TRAINING set so the test data passes through the identical
# transform the model was fit with.
testCapAve <- testing$capitalAve
# z-transform based on training set distribution
testCapAveS <- (testCapAve - mean(training$capitalAve)) / sd(training$capitalAve)
hist(testCapAveS, xlab="ave capital run length deviation")Another way to do this is with the caret::preProcess() function:
# col 58 is the ground-truth outcome
preObj <- preProcess( training[,-58], method=c("center", "scale") )
# pass the Preprocess object into the predict() method
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
hist(testCapAveS, xlab="ave capital run length deviation")To z-transform the corresponding test set, you would create a Preprocess Object from the training set, then pass that object to predict(), along with the testing set:
preObj <- preProcess( training[,-58], method=c("center", "scale") )
# must pass subset of same dims to preProcess() and predict(): ex train[-58] vs test[-58]
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
testCapAveS <- predict(preObj, testing[,-58])$capitalAveYou could also pass the preProcess() function as an argument to train():
set.seed(32343)
modelFit <- train(type ~., data=training, method="glm", preProcess=c("center", "scale"))
modelFit## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9116024 0.8163398
##
##
Other preprocessing transforms are available. The Box-Cox is a power transform that takes a continuous positive variable and forces it toward a normal distribution by introducing a new parameter \(\lambda\)
\[ \mbox{for } y \geq 0: ~~ y(\lambda) = \left\{\begin{matrix} \frac{ y^{\lambda}-1 }{\lambda}, & \mbox{if } \lambda \neq 0\\ log(y), & \mbox{if } \lambda = 0 \end{matrix}\right. \]
preObj <- preProcess(training[,-58], method=c("BoxCox"))
trainCapAveS <- predict(object = preObj, newdata = training[,-58])$capitalAve
par(mfrow=c(1,2))
hist(trainCapAveS)
qqnorm(trainCapAveS)Imputing is the process of assigning values to NAs in the data set. Imputing may use any number of interpolation methods. A common method is k nearest neighbors, which averages the values surrounding each NA.
This is super easy, just add "knnImpute" to preProcess(..., method="..."):
# Demonstrate k-nearest-neighbor imputation: knock out ~half of one
# column, impute it back, then compare against the true values.
set.seed(13343)
# introduce NAs
training$capAve <- training$capitalAve # copy a column
selectNA <- rbinom(dim(training)[1], size=1, prob=.5) == 1 # make a boolean vector
training$capAve[selectNA] <- NA # overwrite some values with NA
# Impute and standardize
# (per the caret docs, knnImpute also triggers centering and scaling,
# so the imputed column comes back standardized)
preObj <- preProcess(training[,-58], method=c("knnImpute"))
capAve <- predict(preObj, training[,-58])$capAve
# Standardize the values
# z-transform the untouched original column so it is comparable to the
# standardized imputed column
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)
quantile(capAve - capAveTruth)## 0% 25% 50% 75% 100%
## -6.552565113 -0.004786531 0.007954623 0.014351073 0.716373528
quantile((capAve - capAveTruth)[selectNA])## 0% 25% 50% 75% 100%
## -6.26513752 -0.01168132 0.01008928 0.02320144 0.71637353
quantile((capAve - capAveTruth)[!selectNA])## 0% 25% 50% 75% 100%
## -6.552565113 -0.001711543 0.007131843 0.011186118 0.014763419
Be mindful that preprocessing parameters (means, standard deviations, PCA rotations, etc.) must be estimated on the training data only; when predicting, apply that same training-derived preProcess object to the test data. Re-estimating the transformations from the test set would produce features on a different scale and skew the predictions in unpredictable ways.
A covariate is yet another name for a model feature. In the real world, not all data variables make it into the model. Covariates (or features, or predictors) have the implicit meaning that they may have been subset from a larger group, and are intended for model inclusion.
Covariates are created in 2 levels, which sort of correspond to in-model and out-model. The goal is to include the most data in the fewest number of features (e.g. maximize data density).
caret functions to simplify this. Creating covariates is a delicate balancing act between information density (summarization) and information loss. It requires substantial application knowledge and domain expertise. Including more features tends toward overfitting (high variance), which can be corrected by regularization.
Dummy Variables are a stupid way to describe the process of converting factor levels into feature variables. If there is a factor variable with 2 levels, create a new feature for each level. Each of those features will have a boolean value for each example. This process expands the data space, widens the data set, and is also known as one-hot encoding.
library(ISLR); library(caret); data(Wage);
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]Explore the data
table(training$jobclass)##
## 1. Industrial 2. Information
## 1051 1051
Create 2 new variables for these classes with caret::dummyVars(formula = , data = , ...)
dummies <- dummyVars(formula = wage~jobclass, data = training)
str(dummies)## List of 9
## $ call : language dummyVars.default(formula = wage ~ jobclass, data = training)
## $ form :Class 'formula' language wage ~ jobclass
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ vars : chr [1:2] "wage" "jobclass"
## $ facVars : chr "jobclass"
## $ lvls :List of 1
## ..$ jobclass: chr [1:2] "1. Industrial" "2. Information"
## $ sep : chr "."
## $ terms :Classes 'terms', 'formula' language wage ~ jobclass
## .. ..- attr(*, "variables")= language list(wage, jobclass)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "wage" "jobclass"
## .. .. .. ..$ : chr "jobclass"
## .. ..- attr(*, "term.labels")= chr "jobclass"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(wage, jobclass)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
## .. .. ..- attr(*, "names")= chr [1:2] "wage" "jobclass"
## $ levelsOnly: logi FALSE
## $ fullRank : logi FALSE
## - attr(*, "class")= chr "dummyVars"
head(predict(object = dummies, newdata = training))## jobclass.1. Industrial jobclass.2. Information
## 86582 0 1
## 161300 1 0
## 155159 0 1
## 11443 0 1
## 376662 0 1
## 450601 1 0
The training set now has a boolean flag for each person and job class.
Some features, whether qualitative, quantitative, or boolean, have very little variability, and won't add much information to a model. Finding numeric features of this kind is easy with caret::nearZeroVar(x= , freqCut = 95/5, uniqueCut = 10) or caret::nzv()
For nearZeroVar(): if saveMetrics = FALSE, returns a vector of integers corresponding to the column positions of the problematic predictors. If saveMetrics = TRUE, a data frame with columns:
nsv <- nearZeroVar(x = training, saveMetrics = TRUE)
nsv## freqRatio percentUnique zeroVar nzv
## year 1.037356 0.33301618 FALSE FALSE
## age 1.027027 2.85442436 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.272931 0.23786870 FALSE FALSE
## race 8.938776 0.19029496 FALSE FALSE
## education 1.389002 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.000000 0.09514748 FALSE FALSE
## health 2.468647 0.09514748 FALSE FALSE
## health_ins 2.352472 0.09514748 FALSE FALSE
## logwage 1.061728 19.17221694 FALSE FALSE
## wage 1.061728 19.17221694 FALSE FALSE
Recall that creating polynomial features is a simple way to improve model fit. Again, this is super easy with the splines package splines::bs function.
From the training$age variable, we want a 3-column basis expansion, analogous to \(\{age, age^2, age^3\}\) (note that bs() actually returns a B-spline basis rather than raw polynomial terms):
library(splines)
# uses B-splines with 3 dof
bsBasis <- bs(training$age, df=3)
head(bsBasis, 3L)## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4163380 0.32117502 0.082587862
## [3,] 0.4308138 0.29109043 0.065560908
For computational efficiency, in practice the modeling code should be separated from the plotting code.
lm1 <- lm(wage~bsBasis, data=training)
# the spline will be the y-arg of a geom_line()
splineLine <- predict(object = lm1, newdata = training)
g <- ggplot(data=training) + aes(x=age, y=wage) + geom_point()
g <- g + geom_line(aes(x=age, y=splineLine), color='red', lwd=2)
print(g)
caret's "gam" method (caret::train(..., method = "gam")) also works, but is more complicated.
Principal Components Analysis will segment out variables that are highly correlated.
Once again, the kernlab::spam dataset:
library(caret); library(kernlab); data(spam);
# creates a numeric vector that can subset from spam
inTrain<-createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
M <- abs(cor(x = training[,-58]))
diag(M) <- 0 # remove autocorrelations
which(M > 0.9, arr.ind=TRUE) # which vars are more than 90% correlated?## row col
## num415 34 32
## num857 32 34
Two features are highly correlated: num415, num857. Double check to be sure:
# index by col num if you don't have the names
plot(spam[,34], spam[,32]) A straight line with \(slope \approx 1\); this is almost perfect correlation. A weighted combination of these features might capture most of the variance while removing all the correlation. Extending this idea, we want to find the smallest matrix \(M\) (subset) that captures the most variability (best explains) the original set. This matrix can be found via SVD/PCA
Takes an original matrix \(X\) and decomposes it into a product of 3 matrices such that
\[X = UDV^T\]
The Principal Components of \(X\) capture the variability of \(X\) in descending order of variance explained. Conveniently, if the input variables have been scaled and mean-shifted (z-transformed), the principal component directions are the columns of \(V\) (the rotation matrix), and the component scores are given by \(UD\).
Principal Components are very easy to find with stats::prcomp()
typeColor <- ((spam$type=="spam")*1 + 1)
prComp <- prcomp(x = log10(spam[ , -58]+1) ) # Gaussify to correct data skew (log10(...))
ggplot() + aes(prComp$x[,1], prComp$x[,2], color = typeColor) + geom_point() +
xlab("PC1") + ylab("PC2") The principal components are pretty distinct.
During preprocessing (the preProcess() method), use method="pca"):
# log10(...) Gaussifies the input for better PCA results
preProc <- preProcess(log10(spam[,-58]+1), method="pca", pcaComp=2)
# the principal components matrix is actually created with predict()
spamPC <- predict(object = preProc, newdata = log10(spam[,-58]+1))
ggplot() + aes(spamPC[,1], spamPC[,2], color=typeColor) + geom_point() +
xlab("PC1") + ylab("PC2") Principal Components still pretty distinct.
From the previous example, spamPC is the training set, the first 2 principal components of kernlab::spam.
# Fit a glm on the first two principal components of the training set.
# Column 58 is the outcome (type), so it is excluded from the PCA;
# log10(x + 1) compresses the heavy right skew (the +1 guards log10(0)).
# scales & shifts for pca
preProc <- preProcess(log10(training[,-58]+1), method="pca", pcaComp=2)
# predict() actually does the PCA, returning the PCM
trainPC <- predict(object = preProc, newdata = log10(training[,-58]+1))
# re-attach the outcome so train() can model type ~ PC1 + PC2
trainPC$type <- training$type
modelFit <- train(type ~., data=trainPC, method="glm")
# modelFit <- train(type ~., data = training, method = 'glm')
modelFit$results## parameter Accuracy Kappa AccuracySD KappaSD
## 1 none 0.9056476 0.8011514 0.005884655 0.01226668
# make some predictions with this model
testPC <- predict(preProc, log10(testing[,-58]+1))
confusionMatrix(testing$type, predict(object=modelFit, newdata=testPC))## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 645 52
## spam 73 380
##
## Accuracy : 0.8913
## 95% CI : (0.8719, 0.9087)
## No Information Rate : 0.6243
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7705
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8983
## Specificity : 0.8796
## Pos Pred Value : 0.9254
## Neg Pred Value : 0.8389
## Prevalence : 0.6243
## Detection Rate : 0.5609
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.8890
##
## 'Positive' Class : nonspam
##
A simpler form for doing the same thing is
modelFit <- train(type ~ ., method="glm", preProcess="pca", data=training)
confusionMatrix(testing$type, predict(modelFit, testing))## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 658 39
## spam 50 403
##
## Accuracy : 0.9226
## 95% CI : (0.9056, 0.9374)
## No Information Rate : 0.6157
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8372
## Mcnemar's Test P-Value : 0.2891
##
## Sensitivity : 0.9294
## Specificity : 0.9118
## Pos Pred Value : 0.9440
## Neg Pred Value : 0.8896
## Prevalence : 0.6157
## Detection Rate : 0.5722
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9206
##
## 'Positive' Class : nonspam
##
In conclusion, transform data & remove outliers before doing this
Linear regressions are simple and easy, but limited to linear settings. With one variable (univariate), just create a line of form \(y = mx + b\). In most machine learning settings, the parameters \(m\) and \(b\) are called weights, and notated with a single symbol: \(\{w_0, w_1 \}\), \(\{\theta_0, \theta_1\}\), etc.
In R, univariate regression is done with caret::train() or stats::predict()
In this example, 1 output (eruption duration) is modeled by 1 input (time between eruptions).
# data in datasets::faithful (base R); caret is loaded for createDataPartition
library(caret); data(faithful); library(ggplot2);
set.seed(333)
# 50/50 split on the waiting-time variable; list=FALSE returns row indices
inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,]
testFaith <- faithful[-inTrain,]
head(trainFaith)## eruptions waiting
## 1 3.600 79
## 3 3.333 74
## 5 4.533 85
## 6 2.883 55
## 7 4.700 88
## 8 3.600 85
ggplot(data=trainFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
geom_abline(slope = 0.07, intercept = -1.5, linetype="dashed") +
xlab("Waiting") + ylab("Duration") The relationship looks pretty linear, \(Duration \approx m \times Waiting + b\)
We can find the best fit line more precisely with stats::lm:
library(stats)
# ordinary least squares: eruption duration as a linear function of waiting time
lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
The slope and intercepts are in Coefficients:; slope = waiting 0.073901, intercept = (Intercept) -1.792739.
An easy way to access the parameters is:
# returns named vectors
lm1$coefficients
# or
coef(lm1)## (Intercept) waiting
## -1.79273941 0.07390058
## (Intercept) waiting
## -1.79273941 0.07390058
Making a single prediction is simply:
# new input = 80: plug it into y = b + m*x using the fitted coefficients
coef(lm1)[1] + coef(lm1)[2]*80## (Intercept)
## 4.119307
Ignore the (Intercept) label — it is not an error. R arithmetic propagates the names attribute of the first operand, so the result inherits the name of coef(lm1)[1]. Use unname() to strip it if desired.
Single predictions are even easier with stats::predict:
# takes a linear model object, and data frame of new inputs
predict(object=lm1, data.frame(waiting=80) )## 1
## 4.119307
You can access all the predictions of this model with:
lm1$fitted[1:5] # returns a numeric vector## 1 3 5 6 7
## 4.045407 3.675904 4.488810 2.271793 4.710512
This makes plotting easy:
# overlay the fitted regression line (red) on the training scatter;
# lm1$fitted holds the in-sample predictions, one per training row
ggplot(data=trainFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
geom_line(aes(x=waiting, y=lm1$fitted), color="red", lwd=1.5) +
xlab("Waiting") + ylab("Duration")We use stats::predict and the existing linear model, built on the training data. When predicting from the test set, we pass the test set into predict() as an argument.
# gridExtra::grid.arrange lays multiple ggplots out side by side
library(gridExtra)
# training set
# predict(lm1) with no newdata returns the in-sample fitted values
g1 <- ggplot(data=trainFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
geom_line( aes( x=waiting, y=predict(lm1) ) ) +
xlab("Waiting") + ylab("Duration") + ggtitle("Training Set")
print(g1)# test set
# newdata= makes predict() score the held-out test rows instead
g2 <- ggplot(data=testFaith) + aes(x=waiting, y=eruptions) + geom_point(color='blue') +
geom_line( aes( x=waiting, y=predict(lm1, newdata = testFaith ) ) ) +
xlab("Waiting") + ylab("Duration") + ggtitle("Test Set")
grid.arrange(g1, g2, ncol=2) Note the differing arguments in
ggplot( data = ) and y = predict(). The test fit is not quite as good, which makes sense.
# training set mean squared error: in-sample fitted values vs training outcomes
sum((lm1$fitted - trainFaith$eruptions)^2)/length(trainFaith$eruptions)## [1] 0.2414883
# test set error - substitute predict(...) for lm1$fitted, and compare against
# the TEST outcomes. (The original compared to trainFaith$eruptions, which
# recycled the longer vector — hence the old "longer object length is not a
# multiple of shorter object length" warning — and produced a meaningless
# error estimate.)
sum(( predict(lm1, newdata=testFaith) - testFaith$eruptions)^2)/length(testFaith$eruptions)
# training RMSE: square root of the in-sample mean squared error
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2)/length(trainFaith$eruptions))## [1] 0.4914146
# test RMSE - again, compare predictions to testFaith$eruptions. (The original
# used trainFaith$eruptions, triggering a length-recycling warning and an
# inflated, meaningless error estimate.)
sqrt(sum(( predict(lm1, newdata=testFaith) - testFaith$eruptions)^2)/length(testFaith$eruptions))
stats::predict() can calculate intervals automatically, if you pass the interval=" " argument. The resulting prediction object has 3 columns: {"fit", "lwr", "upr"} containing the best-fit predictions along with the lower and upper bounds of the prediction interval (95% by default, set via the level= argument). Unfortunately, the columns can’t be accessed with pred1$fit; they must be indexed via pred1[rows, "fit"].
# interval="prediction" returns a matrix with columns fit / lwr / upr
pred1 <- predict(lm1, newdata=testFaith, interval="prediction")
# sort by waiting so geom_line draws connected curves, not zig-zags
ord <- order(testFaith$waiting)
ggplot(data=testFaith) + aes(x=waiting, y=eruptions) + geom_point(color="blue") +
geom_line( aes(x=waiting[ord], y=pred1[ord,"lwr"]), color="red" ) +
geom_line( aes(x=waiting[ord], y=pred1[ord,"fit"]), lwd=1.2) +
geom_line( aes(x=waiting[ord], y=pred1[ord,"upr"]), color="red" ) This method passes a caret linear model object to stats::predict(), which calculates the confidence intervals.
# get a glm with caret::train()
# method="lm" wraps stats::lm; the fitted lm object lives in modFit$finalModel
modFit <- train(eruptions ~ waiting, data=trainFaith, method="lm")
summary(modFit$finalModel)##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
# pass finalModel to stats::predict()
# finalModel is a plain lm, so predict.lm is dispatched and interval= works
pred1 <- predict(modFit$finalModel, newdata=testFaith, interval="prediction")
# sort by waiting so the interval curves below draw cleanly
ord <- order(testFaith$waiting)
# plot it!!!
# red lines = lower/upper prediction bounds; thick black line = best fit
ggplot(data=testFaith) + aes(x=waiting, y=eruptions) + geom_point(color="blue") +
geom_line( aes(x=waiting[ord], y=pred1[ord,"lwr"]), color="red" ) +
geom_line( aes(x=waiting[ord], y=pred1[ord,"fit"]), lwd=1.2) +
geom_line( aes(x=waiting[ord], y=pred1[ord,"upr"]), color="red" ) Taking another look at the ISLR::Wage data set reveals multiple features that seem important.
library(ISLR); library(ggplot2); library(caret);
data(Wage)
# subset out the variable we're trying to predict
# (logwage is a transform of wage; keeping it would leak the answer)
Wage <- subset(Wage, select= -c(logwage))
summary(Wage)## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083
## 2. Information:1456 2. >=Very Good:2142 2. No : 917
##
##
##
##
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
At this point, create training and test sets, just for the helluvit.
# 70/30 train/test split, stratified on the wage outcome
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training); dim(testing);## [1] 2102 11
## [1] 898 11
To begin exploration, a feature plot reveals relationships between the 3 features and the output
# caret::featurePlot()
featurePlot(x=training[, c("age", "education", "jobclass")], y=training$wage, plot="pairs") Each pair plot shows a mild trend with some outliers, suggesting that each feature may be used as a predictor.
Let’s take a closer look at age
# this first plot shows a VERY subtle trend
# this first plot shows a VERY subtle trend
g1 <- ggplot(data=training) + aes(x=age, y=wage) + geom_point()
# we can plot a second feature by using color
g2 <- ggplot(data=training) + aes(x=age, y=wage, color=jobclass) + geom_point()
grid.arrange(ncol=2, g1, g2) The gray plot shows that high wage correlates with middle age, but not much else. Adding color shows that information jobs skew higher wages, and account for most of the high wage outliers.
What about education?
g3 <- ggplot(data=training) + aes(x=age, y=wage, color=education) + geom_point()
grid.arrange(ncol=2, g2, g3) Now we see that more education generally correlates with higher wage.
caret::train(formula= ) can take multiple features as x-arguments; it will then fit a model of the form
\[ y = b_0 + b_1 \cdot age + \sum_{j=1}^{J} b_{2j} \cdot 1(jobclass_j) + \sum_{k=1}^{K} b_{3k} \cdot 1(education_k) \]
# linear model on three features; the factor features (jobclass, education)
# are automatically expanded into dummy-variable predictors
modFit <- train(wage ~ age+jobclass+education, method="lm", data=training)
# extract the underlying fitted lm for diagnostics below
finMod <- modFit$finalModel
print(modFit)## Linear Regression
##
## 2102 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results:
##
## RMSE Rsquared
## 36.06666 0.2517055
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
##
Each iteration sampled all 2102 rows of the training set. The 3 features have been expanded to 10 predictors to account for the multiple classes of jobclass and education.
The base plot() function will iterate through all the pairwise plots in the multivariate model (there are 6).
# which=1 selects the residuals-vs-fitted diagnostic; translucent black points
plot(finMod, which = 1, cex=0.5, col="#00000010") Outliers at top-left are labeled.
Sometimes, coloring by a variable not included in the model will show relationships.
# residuals vs fitted values, colored by a variable NOT in the model;
# since data=training is supplied, refer to the column as `race` — the
# original's training$race bypassed the data mapping (works only by accident
# of matching row order)
ggplot(data=training) + aes(x=finMod$fitted, y=finMod$residuals, color=race) + geom_point() The highest outliers are all black, hmmm. What might that suggest?
Sometimes there will be systemic bias due to the row-ordering of a data set. Plotting residuals vs row-index will show this.
# residuals vs row index; seq_along() replaces the hard-coded 2102 so the
# plot stays correct whatever the training-set size is
ggplot() + aes(x=seq_along(finMod$residuals), y=finMod$residuals) + geom_point() That doesn’t seem to be the case here, but when there is a trend, it suggests that a variable is missing from the model.
# "All" model: regress wage on every available covariate. (The original
# repeated the 3-feature formula from modFit above, making modFitAll an
# exact duplicate; `wage ~ .` matches the model's name and intent.)
modFitAll <- train(wage ~ ., method="lm", data=training)
# score the held-out test set with the full model
pred <- predict(modFitAll, testing)
ggplot(data=testing) + aes(x=wage, y=pred) + geom_point() This plot shows the predicted wage that corresponds to each actual wage.